# visibility with slope vs low sf
library(tidyr)
libraray(dplyr)
younger =  read.csv("properties_infant.csv")
younger = younger[1:25000,] # full infant dataset has 25,691 images

# create the 2D visualization space
measure_gp1 = younger %>% select(edges_full, domOr)
pca1 <- prcomp(measure_gp1, center = TRUE,scale. = TRUE)
complexity = pca1$x[,1]
maxC = max(complexity)
younger$simplicity = maxC - complexity

measure_gp2 = younger %>% select(slope, contrast)
pca2 <- prcomp(measure_gp2, center = TRUE,scale. = TRUE) 
murkiness = pca2$x[,1]+5
maxM = max(murkiness)
younger$visibility = maxM - murkiness

# Measure # of properties above infant median
mv = median(younger$visibility)
ms = median(younger$simplicity)

idf = younger %>% 
  select("fileNames","edges_full","domOr","contrast","slope","threecluster","simplicity",
         "visibility")

medge = median(idf$edges_full)
mOr = median(idf$domOr, na.rm = TRUE)
mcon = median(idf$contrast, na.rm = TRUE)
mslp = median(idf$slope, na.rm = TRUE)

idf$highedges = 0
idf$highslope = 0
idf$highdomOr = 0
idf$highcontrast = 0
idf$nHigh = 0

for(i in 1:nrow(idf)){
  if(idf$edges_full[i]<medge){idf$highedges[i] = 1}
  if(idf$domOr[i]>mOr){idf$highdomOr[i] = 1}
  if(idf$contrast[i]>mcon){idf$highcontrast[i] = 1}
  if(idf$slope[i]>mslp){idf$highslope[i] = 1}
  idf$nHigh[i] = sum(idf$highedges[i],idf$highdomOr[i],
                         idf$highcontrast[i],idf$highslope[i])
}

younger = younger[order(younger$fileNames),] %>%
  mutate(HEVS = ifelse(visibility > mv & simplicity > ms,1,0))

# import filtered images
DF = read.csv("properties_filtered_gauss.csv") %>%
  unique()
DF = DF[order(DF$filename),]

idf2 = DF %>%
  select(c("fileNames","edges_full","domOr","contrast","slope")) %>%
  mutate(group = "1 to 3 post-filter")

idf2 = idf2[1:25000,] # trim to match image sample of adults & infant pre-filter

# Project PCA dimensions onto filtered data
filterc = predict(pca1, postmeas_gp1)
complexity = filterc[,1]
idf2$simplicity = maxC-complexity

postmeas_gp2 = idf2 %>% select(slope, contrast)
filterv = predict(pca2, postmeas_gp2)
murkiness = filterv[,1]+5
idf2$visibility = maxM - murkiness

idf$group = "1 to 3 pre-filter"
test = rbind(idf[,c(1,7,8,14)],idf2[,c(1,6,7,8)]) %>%
	mutate(simple = ifelse(simplicity > ms,1,0), visible = ifelse(visibility > mv,1,0))

# read in adults
adf = read.csv("X:/1C_LooxcieAdult/05_coding/datasets/properties_adults_4s_renamed.csv") %>%
	filter(edges_full>0)
fns = adf$fileNames

# Adults only have 24,775 images at 4s - sample the remaining 221 from 5s frequency  
extra_adf = read.csv("X:/1C_LooxcieAdult/05_coding/datasets/properties_adults_5s_renamed.csv") %>%
	mutate(not4 = ifelse(fileNames %in% fns,0,1)) %>%
	filter(not4 == 1 & edges_full>0) %>% 
	select(-not4)

25000-nrow(adf)
set.seed(23)
rand_df <- extra_adf[sample(nrow(extra_adf), size=225), ]

adf2 = rbind(adf,rand_df) %>%
  filter(edges_full>0) %>%
  select(c("fileNames","edges_full","domOr","contrast","SFslope","cluster")) %>%
  rename("slope" = "SFslope","threecluster" = "cluster")

# Project PCA dimensions onto adult data
filterc = predict(pca1, adf2[,c(2,3)]) # edges, domor
complexity = filterc[,1]
adf2$simplicity = maxC-complexity
filterv = predict(pca2, adf2[,c(5,4)]) # slope, contrast
murkiness = filterv[,1]+5
adf2$visibility = maxM - murkiness

# Measure # of properties above infant median
adf2$highedges = 0
adf2$highslope = 0
adf2$highdomOr = 0
adf2$highcontrast = 0

adf2$nHigh = 0

for(i in 1:nrow(adf2)){
  if(adf2$edges_full[i]<medge){adf2$highedges[i] = 1}
  if(adf2$domOr[i]>mOr){adf2$highdomOr[i] = 1}
  if(adf2$contrast[i]>mcon){adf2$highcontrast[i] = 1}
  if(adf2$slope[i]>mslp){adf2$highslope[i] = 1}
  adf2$nHigh[i] = sum(adf2$highedges[i],adf2$highdomOr[i],
                      adf2$highcontrast[i],adf2$highslope[i])
}

# cluster analyses for all 4 datasets
mi = younger %>% select("edges_full","domOr","contrast","slope")
mfi = idf2 %>% select("edges_full","domOr","contrast","slope")
ma = adf2 %>% select("edges_full","domOr","contrast","slope")
adf3 = read.csv("X:/1C_LooxcieAdult/05_coding/datasets/properties_adult_filtered_gauss.csv")

mfa = adf3 %>% 
  select("edges_full","domOr","contrast","slope")
mfa = mfa[complete.cases(mfa),]

library(class)
mfi$threecluster <- knn(mi,mfi,cl=younger$threecluster,k=3)

ma$threecluster <- knn(mi,ma,cl=younger$threecluster,k=3)

mfa$threecluster <- knn(mi,mfa,cl=younger$threecluster,k=3)

younger %>% group_by(threecluster) %>% summarise(n=n(), prop = n()/nrow(younger))
mfi %>% group_by(threecluster) %>% summarise(n=n(), prop = n()/nrow(mfi))
ma %>% group_by(threecluster) %>% summarise(n=n(), prop = n()/nrow(ma))
mfa %>% group_by(threecluster) %>% summarise(n=n(), prop = n()/nrow(mfa))

mi$threecluster = younger$threecluster
mi$group = "inf pre"
ma$group = "adu pre"
mfi$group = "inf post"
mfa$group = "adu post"

adu2inf = rbind(mi,ma)
iprepost = rbind(mi,mfi)
aprepost = rbind(ma,mfa)
postadu2inf = rbind(mi,mfa)
chisq.test(adu2inf$threecluster,adu2inf$group)
chisq.test(iprepost$threecluster,iprepost$group)
chisq.test(aprepost$threecluster,aprepost$group)
chisq.test(postadu2inf$threecluster,postadu2inf$group)

# RUN LENGTHS
##### RUNS
younger= younger%>% mutate(HEVS = ifelse(visibility > mv & simplicity > ms,1,0))

c1 = younger[younger$HEVS ==1,]
c1$event = 1
c1$event_row = 1
c1$run = 0
c1$imnum = 0
c1$subject = ""
for(i in 1:nrow(c1)){
  imnum = c1$fileNames[i]
  imnum1 = unlist(strsplit(imnum,"_"))
  imnum2 = substr(imnum1[2],1,6)
  c1$subject[i] = imnum1[1]
  c1$imnum[i]= as.numeric(imnum2)
  }

# step 2 - get runs - what's median run length?
ec = 1
for(i in 2:nrow(c1)){
  if(c1$imnum[i]== c1$imnum[i-1]+25){    
    c1$event_row[i] = c1$event_row[i-1] + 1
    c1$run[i] = 1
    c1$run[i-1] = 1}
  else{ ec = ec +1; c1$event_row[i] = 1}
  c1$event[i] = ec}

sum_y = c1 %>%
  group_by(event,subject) %>%
  summarise(run_length = max(event_row)) %>%
  filter(run_length > 1)

mean(sum_y$run_length)
median(sum_y$run_length)
max(sum_y$run_length)

adf5 = read.csv("X:/1C_LooxcieAdult/05_coding/datasets/properties_adults_5s_renamed.csv") %>%
  filter(edges_full>0) %>%
  select(c("fileNames","edges_full","domOr","contrast","SFslope","cluster")) %>%
  rename("slope" = "SFslope","threecluster" = "cluster")

filterc = predict(pca1, adf5[,c(2,3)]) # edges, domor
complexity = filterc[,1]
adf5$simplicity = maxC-complexity
filterv = predict(pca2, adf5[,c(5,4)]) # slope, contrast
murkiness = filterv[,1]+5
adf5$visibility = maxM - murkiness

adf5 = adf5 %>% mutate(HEVS = ifelse(visibility > mv & simplicity > ms,1,0))

c2 = adf5[adf5$HEVS ==1,]
c2$event = 1
c2$event_row = 1
c2$run = 0
c2$imnum = 0
c2$subject = ""
for(i in 1:nrow(c2)){
  imnum = c2$fileNames[i]
  imnum1 = unlist(strsplit(imnum,"_"))
  imnum2 = substr(imnum1[2],1,6)
  c2$subject[i] = imnum1[1]
  c2$imnum[i]= as.numeric(imnum2)
  }

# step 2 - get runs - what's median run length?
ec = 1
for(i in 2:nrow(c2)){
  if(c2$imnum[i] == c2$imnum[i-1]+25){    
    c2$event_row[i] = c2$event_row[i-1] + 1
    c2$run[i] = 1
    c2$run[i-1] = 1}
  else{ ec = ec +1; c2$event_row[i] = 1}
  c2$event[i] = ec}

sum_a = c2 %>%
  group_by(event, subject) %>%
  summarise(run_length = max(event_row)) %>%
  filter(run_length > 1)

mean(sum_a$run_length, na.rm = TRUE)
median(sum_a$run_length)
max(sum_a$run_length)

sum_a$group = "adult"
sum_y$group = "infant"
sum_both = rbind(sum_a,sum_y)

library(lmerTest)
model1 = lmer(run_length ~ group + (1 | subject), data = sum_both)

anova(model1)


# ALL PLOTS
# 2D space of 1-3 pre-filter
library(ggplot2)
ggplot(younger, aes(x = simplicity, y= visibility)) +
  geom_point(alpha = .3, size = .5)+
  geom_vline(xintercept = ms, colour="gold", linetype = "longdash")+
  geom_hline(yintercept = mv, colour="gold", linetype = "longdash")+
  ylab("Edge visibility")+
  xlab("Edge simplicity")+
  ylim(-.5,9.5)+
  theme_classic()

plotpath = "C:/Users/erinm/OneDrive/Documents/v1 paper/plots"
ggsave(file.path(plotpath,"infant_scatter_all.jpg"), plot = last_plot(),
  width = 1200, height = 1000, units = "px")

# 2D space of 1-3 where color indicates # of properties above median
library(RColorBrewer)

ggplot(idf, aes(x = simplicity, y = visibility, color = factor(nHigh) ))+
  geom_point(alpha = .3, size = .5)+
  geom_vline(xintercept = mv)+
  geom_hline(yintercept = ms)+
  scale_color_brewer(palette = "OrRd") +
  theme_classic()+
  ylab("Edge visibility")+
  xlab("Edge simplicity")
both$log_edge= log10(both$edges_full)

# Plot infant changes pre_post filter
iprepost = rbind(idf,idf2)
iprepost$group = factor(iprepost$group, levels = c("1 to 3 pre-filter","1 to 3 post-filter"))

ggplot(iprepost, aes(x = slope, y= ..count.., fill = group)) +
  geom_density(alpha = .5)+
  theme_classic()+
  ylab("Density")+
  scale_fill_manual(values = c("gray10","gray70"))

ggplot(iprepost, aes(x = simplicity, y= ..count.., fill = group)) +
  geom_density(alpha = .5)+
  theme_classic()+
  xlab("simplicity")+
  ylab("Density")+
  scale_fill_manual(values = c("gray70","gray10"))

ggplot(iprepost, aes(x = visibility, y= ..count.., fill = group)) +
  geom_density(alpha = .5)+
  theme_classic()+
  xlab("visibility")+
  ylab("Density")+
  scale_fill_manual(values = c("gray70","gray10"))

ggplot(iprepost, aes(x = simplicity, y= visibility, color = group)) +
  geom_point(alpha = .5,size = .3)+
  theme_classic()+
  geom_hline(yintercept = mv) +
  geom_vline(xintercept = ms) +
  ylab("visibility index")+
  xlab("simplicity index")+
  scale_color_manual(values = c("gray10","gray70"))

# Plot adults in 2D space
ggplot(adf2, aes(x = simplicity, y= visibility)) +
  geom_point(alpha = .3, size = .5)+
  geom_vline(xintercept = ms, colour="gold", linetype = "longdash")+
  geom_hline(yintercept = mv, colour="gold", linetype = "longdash")+
  ylab("Edge visibility")+
  xlab("Edge simplicity")+
  ylim(-.5,9.5)
  
idf$group = "1 to 3 months"
adf2$group = "adult"
adult_inf = rbind(idf,adf2) %>%
	mutate(HEVS = ifelse(visibility > mv & simplicity > ms,1,0))

#adult_inf$group = factor(adult_inf$group, levels = c("adult","1 to 3 months"))

ggplot(adf2, aes(x = domOr))+
  geom_density(alpha = .8, fill = "#E6A0C4")+
  theme_classic()+
  xlim(0.06,.17)

idf$log_edge = log10(idf$edges_full)
adf2$log_edge = log10(adf2$edges_full)
adult_inf$log_edge = log10(adult_inf$edges_full)

# Adult & infant overlaid proerties
ggplot(adult_inf, aes(x = log_edge, fill = group))+
  geom_density(alpha = .8)+
  geom_vline(xintercept = median(idf$log_edge), linetype = "longdash") +
  geom_vline(xintercept = median(adf2$log_edge)) +
  theme_classic()+
  scale_fill_manual(values = c("#7294D4","#E6A0C4"))

ggplot(adult_inf, aes(x = domOr, fill = group))+
  geom_density(alpha = .8)+
  geom_vline(xintercept = median(idf$domOr), linetype = "longdash") +
  geom_vline(xintercept = median(adf2$domOr)) +
  theme_classic()+
  scale_fill_manual(values = c("#7294D4","#E6A0C4"))

ggplot(adult_inf, aes(x = contrast, fill = group))+
  geom_density(alpha = .8)+
  geom_vline(xintercept = median(idf$contrast), linetype = "longdash") +
  geom_vline(xintercept = median(adf2$contrast)) +
  theme_classic()+
  scale_fill_manual(values = c("#7294D4","#E6A0C4"))

ggplot(adult_inf, aes(x = slope, fill = group))+
  geom_density(alpha = .8)+
  geom_vline(xintercept = median(idf$slope), linetype = "longdash") +
  geom_vline(xintercept = median(adf2$slope)) +
  theme_classic()+
  scale_fill_manual(values = c("#7294D4","#E6A0C4"))

# 2D space with adults and infants, where color is # of properties
ggplot(adult_inf, aes(x = simplicity, y = visibility, color = nHigh))+
  geom_point(alpha = .3, size = 0.7)+
  theme_classic()+
  scale_color_gradient(low = "grey10", high = "grey80")+
  ylab("visibility index")+
  xlab("simplicity index")+
  facet_wrap(~group)

# 2D space with adults and infants, where color is # cluster group
ggplot(adult_inf, aes(x = simplicity, y = visibility, color = factor(threecluster))) +
  geom_point(alpha = .3, size = .7)+
  theme_classic()+
  ylab("visibility index")+
  xlab("simplicity index")+
  geom_hline(yintercept = mv, linetype = "longdash") +
  geom_vline(xintercept = ms, linetype = "longdash") +
  facet_wrap(~group)

test = adult_inf %>% 
	mutate(simple = ifelse(threecluster == 1, 1,0))

test = adult_inf %>% 
	mutate(quad = ifelse(simplicity > ms & visibility > mv, "UR",
			ifelse(simplicity > ms & visibility <= mv, "LR",
				ifelse(simplicity <= ms & visibility > mv, "UL", "LL"))))

chisq.test(test$quad,test$group)
chisq.test(test$simple,test$group)
chisq.test(adult_inf$threecluster,adult_inf$group)
chisq.test(adult_inf$nHigh,adult_inf$group)